library("tidyverse")
library("patchwork")
library("viridis")
library("ggrepel")
library("DT")

0. Read datasets

img_details <- read_csv("data_emilia/img_details.csv", show_col_types = FALSE)
asta_samples <- read_csv("data_emilia/asta_samples.csv", show_col_types = FALSE)
red_ind_final <- read_csv("data_emilia/red_ind_final.csv", show_col_types = FALSE) %>% rename(image_name = img_name)
img_details <- img_details %>% left_join(red_ind_final)

Dataset for all images (699 images) :

red_index_by_image <- img_details %>% 
  group_by(sample_name) %>% 
  mutate(sample_redness = mean(redness_index), sample_red_pixels = mean(red_pixels_nb)) %>% 
  left_join(select(asta_samples, sample_name,asta_ng_mg,mass_mg), by = "sample_name") %>% 
  mutate(asta_ng = asta_ng_mg*mass_mg)

datatable(red_index_by_image, rownames = FALSE, filter="top", options = list(pageLength = 5, scrollX=T))

Sample level (70 samples) :

red_index_by_sample <- img_details %>% 
  group_by(sample_name) %>% 
  summarise(sample_redness = mean(redness_index), sample_red_pixels = mean (red_pixels_nb)) %>% 
  left_join(asta_samples, by = "sample_name") %>% 
  mutate(asta_ng = asta_ng_mg*mass_mg)
datatable(red_index_by_sample, rownames = FALSE, filter="top", options = list(pageLength = 5, scrollX=T))


1. Path to images and images selection

Add path to images in the detailed dataset :

# Paths in
red_index_by_image$path_in_original = str_c("~/complex/ecoplast_clean_dataset/1.Original/",red_index_by_image$image_name)
red_index_by_image$path_in_calib = str_c("~/complex/ecoplast_clean_dataset/2.Calibrated/",red_index_by_image$image_name)
red_index_by_image$path_in_index = str_c("~/complex/ecoplast_clean_dataset/4.Red_pixels_highlighted/",red_index_by_image$image_name)
red_index_by_image$path_in_mask = str_c("~/complex/ecoplast_clean_dataset/3.Red_pixels_masked/",red_index_by_image$image_name)

# Paths out
red_index_by_image$path_out_original = str_c("~/complex/ecoplast_clean_dataset/results/sample_to_show/",red_index_by_image$image_name, "original.tif")
red_index_by_image$path_out_calib = str_c("~/complex/ecoplast_clean_dataset/results/sample_to_show/",red_index_by_image$image_name, "calib.tif")
red_index_by_image$path_out_index = str_c("~/complex/ecoplast_clean_dataset/results/sample_to_show/",red_index_by_image$image_name, "index.tif")
red_index_by_image$path_out_mask = str_c("~/complex/ecoplast_clean_dataset/results/sample_to_show/",red_index_by_image$image_name, "mask.tif")

We want to select ten images of Figure 3.1, including three images with a known redness gradient for Fig. 3.3. The seven others are chosen randomly.

selection3 <- red_index_by_image %>% filter(image_name %in% c("2018_G_p85_5.tif", "2018_AR_p22_4.tif", "2018_AR_p21_5.tif"))
#sample7 <- sample(x = 1:699, size = 7)
#sample7_to_show <- red_index_by_image[sample7,]
#write_csv(sample7_to_show, "data_emilia/sample7_to_show.csv")
sample7_to_show <- read_csv("data_emilia/sample7_to_show.csv", show_col_types = FALSE)
sample7_to_show$path_in_original = str_c("~/complex/ecoplast_clean_dataset/1.Original/",sample7_to_show$image_name)
sample7_to_show$path_out_original = str_c("~/complex/ecoplast_clean_dataset/results/sample_to_show/",sample7_to_show$image_name, "original.tif")
# setdiff(sample7_to_show$image_name, selection3$image_name)
# -> ok they are all different


Representation of selected images on the relationship with raw data.

ggplot(red_index_by_image)+
  # all images
  geom_point(aes(x = asta_ng, y = redness_index), color = "grey50", alpha = 0.7, size = 0.5)+
  # averages and smooth
  geom_point(aes(x = asta_ng, y = sample_redness), color = "#01665e", size = 1.5)+
  geom_smooth(aes(x = asta_ng, y = sample_redness), color = "#01665e", linewidth = 0.5, se= F)+
  # selected 3 images
  geom_point(aes(x = asta_ng, y = redness_index), color = "#d94801", alpha = 0.7, size = 1, data = selection3)+
  geom_text_repel(aes(x = asta_ng, y = redness_index, label = image_name), color = "#d94801", data = selection3, size =3, max.overlaps = 15)+
  # random sample of seven images
  geom_point(aes(x = asta_ng, y = redness_index), color = "#d94801", alpha = 0.7, size = 1, data = sample7_to_show)+
  geom_text_repel(aes(x = asta_ng, y = redness_index, label = image_name), color = "#d94801", data = sample7_to_show, size =3, max.overlaps = 15)+
  labs(x= "Total astaxanthin content (10 individuals) (ng)", y ="Redness indices (averages for 10 images in red)")+
  theme_bw(base_size=10)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'


Final sample of 10 images associated with the image name on Figure 3.1.

sample7_to_show <- mutate(sample7_to_show, graphid = "sample7")
selection3 <- mutate(selection3, graphid = "selection3")
sample_to_show <- bind_rows(sample7_to_show, selection3)

image_letters <- read_delim("data_emilia/image_letters.csv", 
    delim = ";", escape_double = FALSE, trim_ws = TRUE, show_col_types = FALSE)

sample_to_show  <- sample_to_show  %>% left_join(image_letters ) %>%  select(image_name, image_letter, everything())
datatable(sample_to_show, rownames = FALSE, filter="top", options = list(pageLength = 3, scrollX=T))



2. Tests on data transformations and representation :

Log (ln) transformation and initial (intuitive) representation of RI = f(TAC) :

a <- ggplot(red_index_by_image)+
  # all images
  geom_point(aes(y = asta_ng,x = redness_index), color = "grey60", alpha = 0.7, size = 1)+
  # averages and smooth
  geom_point(aes(y = asta_ng, x = sample_redness), color = "#01665e", size = 2)+
  #geom_smooth(aes(x = asta_ng, y = sample_redness_log), color = "grey50", size = 0.5, se= F)+
  geom_smooth(aes(y = asta_ng, x = sample_redness), method = "lm", formula = 'y ~ x',color = "#01665e", linewidth = 0.5, se= F)+
  # print selected image
  geom_point(aes(y = asta_ng, x = redness_index), color = "#d94801", size = 1.5, data = sample_to_show)+
  geom_text_repel(aes(y = asta_ng, x = redness_index, label = image_letter), color = "#d94801", data = sample_to_show, size =4, max.overlaps = 15)+
  labs(y= "Total astaxanthin content (TAC) (ng) (log axis)", x ="Redness indices (log axis)")+
  theme_bw(base_size=12) + 
  scale_y_continuous(trans = "log")+ 
  scale_x_continuous(trans = "log")


b <- ggplot(red_index_by_image)+
  # all images
  geom_point(aes(x = asta_ng, y = redness_index), color = "grey60", alpha = 0.7, size = 1)+
  # averages and smooth
  geom_point(aes(x = asta_ng, y = sample_redness), color = "#01665e", size = 2)+
  #geom_smooth(aes(x = asta_ng, y = sample_redness_log), color = "grey50", size = 0.5, se= F)+
  geom_smooth(aes(x = asta_ng, y = sample_redness), method = "lm", formula = 'y ~ x',color = "#01665e", linewidth = 0.5, se= F)+
  # print selected image
  geom_point(aes(x = asta_ng, y = redness_index), color = "#d94801", size = 1.5, data = sample_to_show)+
  geom_text_repel(aes(x = asta_ng, y = redness_index, label = image_letter), color = "#d94801", data = sample_to_show, size =4, max.overlaps = 15)+
  labs(x= "Total astaxanthin content (TAC) (ng) (log axis)", y ="Redness indices (log axis)")+
  theme_bw(base_size=12) + 
  scale_y_continuous(trans = "log")+ 
  scale_x_continuous(trans = "log")

a + b

Large improvement with log-log represention.

Test to check results of various regression types:

1) Linear model without log

  1. TAC = f(RI)
    -> pvalue < 0.001, Adjusted R squared = 35%
summary(lm(asta_ng ~ sample_redness, data = red_index_by_sample))
## 
## Call:
## lm(formula = asta_ng ~ sample_redness, data = red_index_by_sample)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -12.600  -4.829  -2.390   2.170  32.720 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    4.838617   1.675392   2.888  0.00519 ** 
## sample_redness 0.046039   0.007382   6.237 3.26e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.663 on 68 degrees of freedom
## Multiple R-squared:  0.3639, Adjusted R-squared:  0.3545 
## F-statistic:  38.9 on 1 and 68 DF,  p-value: 3.263e-08
# 35%
  1. RI = f(TAC)
    -> pvalue < 0.001, same Adjusted R squared = 35%
summary(lm(sample_redness ~ asta_ng, data = red_index_by_sample))
## 
## Call:
## lm(formula = sample_redness ~ asta_ng, data = red_index_by_sample)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -228.13  -68.28  -40.67   99.31  234.92 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   75.258     21.394   3.518 0.000781 ***
## asta_ng        7.904      1.267   6.237 3.26e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 113.5 on 68 degrees of freedom
## Multiple R-squared:  0.3639, Adjusted R-squared:  0.3545 
## F-statistic:  38.9 on 1 and 68 DF,  p-value: 3.263e-08
# 35%



2) Linear model with log

  1. ln(TAC) = f(ln(RI))
    -> p-value < 0.001, Adjusted R-squared = 0.599, slope (b) = 0.83, intercept (ln(a)) = -1.82 so a = 0.1620258
# Log(ln) transformation 
summary(lm(log(asta_ng) ~ log(sample_redness), data = red_index_by_sample))
## 
## Call:
## lm(formula = log(asta_ng) ~ log(sample_redness), data = red_index_by_sample)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.77473 -0.46826 -0.03245  0.37502  1.70927 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -1.81878    0.39543    -4.6  1.9e-05 ***
## log(sample_redness)  0.83076    0.08143    10.2  2.4e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7089 on 68 degrees of freedom
## Multiple R-squared:  0.6048, Adjusted R-squared:  0.599 
## F-statistic: 104.1 on 1 and 68 DF,  p-value: 2.396e-15
# 60%
lm_TAC_RI <- lm(log(asta_ng) ~ log(sample_redness), data = red_index_by_sample)
opar <- par(mfrow = c(2,2), oma = c(0, 0, 1.1, 0))
plot(lm_TAC_RI , las = 1)      # Residuals, Fitted, ...

par(opar)
  1. ln(RI) = f(ln(TAC))
    -> p-value < 0.001, Adjusted R-squared = 0.599, slope (b) = 0.73, intercept (ln(a)) = 3.20, so a = 24.53253
summary(lm(log(sample_redness) ~ log(asta_ng), data = red_index_by_sample))
## 
## Call:
## lm(formula = log(sample_redness) ~ log(asta_ng), data = red_index_by_sample)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.52924 -0.41686  0.05062  0.49078  1.22072 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3.19852    0.17093   18.71  < 2e-16 ***
## log(asta_ng)  0.72806    0.07136   10.20  2.4e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6636 on 68 degrees of freedom
## Multiple R-squared:  0.6048, Adjusted R-squared:  0.599 
## F-statistic: 104.1 on 1 and 68 DF,  p-value: 2.396e-15
lm_RI_TAC <- lm(log(sample_redness) ~ log(asta_ng), data = red_index_by_sample)
opar <- par(mfrow = c(2,2), oma = c(0, 0, 1.1, 0))
plot(lm_RI_TAC , las = 1)      # Residuals, Fitted, ...

par(opar)

3) Linear model with log on all images

  1. ln(TAC) = f(ln(RIimages))
    -> p-value < 0.001, Adjusted R-squared = 0.4558, slope (b) = 0.59, intercept (ln(a)) = -0.62
    -> more variability, R2 less important
# Ln transformation + with all images and no averages
summary(lm(log(asta_ng) ~ log(redness_index), data = red_index_by_image))
## 
## Call:
## lm(formula = log(asta_ng) ~ log(redness_index), data = red_index_by_image)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.0743 -0.4657 -0.0073  0.4610  3.2592 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -0.62011    0.11738  -5.283  1.7e-07 ***
## log(redness_index)  0.59632    0.02464  24.201  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.82 on 697 degrees of freedom
## Multiple R-squared:  0.4566, Adjusted R-squared:  0.4558 
## F-statistic: 585.7 on 1 and 697 DF,  p-value: < 2.2e-16
# 46% -> more variability 

4) Linear model with number of red pixels instead of RI ln(TAC) = f(ln(red_pix))
-> p-value < 0.001, Adjusted R-squared = 0.5821, slope (b) = 0.94, intercept (ln(a)) = -1.17
Linear model test :

red_pix_sample <-
  red_index_by_image %>%
  group_by(sample_name) %>% 
  summarise(asta_ng = mean(asta_ng), red_pixels_nb = mean(red_pixels_nb))

summary(lm(log(asta_ng) ~ log(red_pixels_nb), data = red_pix_sample))
## 
## Call:
## lm(formula = log(asta_ng) ~ log(red_pixels_nb), data = red_pix_sample)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.77191 -0.44083  0.01075  0.34020  1.54881 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -1.17789    0.34584  -3.406  0.00111 ** 
## log(red_pixels_nb)  0.94703    0.09611   9.854 9.92e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7237 on 68 degrees of freedom
## Multiple R-squared:  0.5881, Adjusted R-squared:  0.5821 
## F-statistic:  97.1 on 1 and 68 DF,  p-value: 9.924e-15


  • The best models are indeed the log-log relationship between redness indices at the sample level and HPLC quantification. The use of redness indices instead of redness pixels numbers doens not improve the relationship much (1.7%), but better than nothing.
  • Diagnostic graphs are not perfect but quite good: residuals looks normal, leverage is correct (not too close to cook’s distance). I found them a little bit better for RI = f(TAC)
  • The sense of the relationship does not affect the R-squared, but affects the slope and produce different outliers in each case. I found difficult to choose one representation over another : TAC=f(RI) is better to find the astaxanthin content. Also, we said that the error of HPLC might be larger than the one from imaging analysis. But in fact, because the dataset is constructed with 1 HPLC point for 10 redness indices, the variation is only visible (and large) for RI. Thus, it might be coherent to represent RI= f(TAC). In Vogedes et al (2010), they did not have the same issue because they had 1 lipid quantification for 1 image. I still don’t know what representation to choose: what do you think ? Is it interesting to put both ?.



Density distribution of orginal data (log transformed):

Want to check distribution of original data.

# Image redness indices
a <- ggplot(red_index_by_image)+
  geom_density(aes(x = log(redness_index)), color = "black")+
  labs(title="Image redness indices")

# Sample redness indices
b <- ggplot(red_index_by_image)+
  geom_density(aes(x = log(sample_redness)), color = "black")+
  labs(title="Sample redness indices")

# HPLC astaxanthin
c <- ggplot(red_index_by_image)+
  geom_density(aes(x = log(asta_ng)), color = "black")+
  labs(title="HPLC (ng)")

a + b + c

Distribution is clearly not normal, but bi-modal. We can suppose an influence of the year, and/or a “real” bi-modal distribution of red vs not red copepods (between C.finmarchicus and C.glacialis, or between two water masses for example?).


Summary redness index for all images :

summary(red_index_by_image$redness_index)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##    0.6108   38.5207  116.9253  178.0560  307.8942 1046.6369



Summary redness for samples :

summary(red_index_by_image$sample_redness)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   10.53   44.83  131.24  178.06  322.14  442.70



Summary HPLC for samples :

summary(red_index_by_image$asta_ng)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.3435  4.0698 10.7422 13.0184 18.2080 47.9790


3. Influence of the year

Add year as the shape of points on the graph.

a <- ggplot(red_index_by_image)+
  # all images
  geom_point(aes(x = asta_ng, y = redness_index, shape = factor(year)), color = "grey60", alpha = 0.7, size = 1)+
  # averages and smooth
  geom_point(aes(x = asta_ng, y = sample_redness, shape = factor(year)), color = "#01665e", size = 2)+
  #geom_smooth(aes(x = asta_ng, y = sample_redness_log), color = "grey50", size = 0.5, se= F)+
  geom_smooth(aes(x = asta_ng, y = sample_redness), method = "lm", color = "#01665e", formula = 'y ~ x',linewidth = 0.5, se= F)+
  # print selected image
  geom_point(aes(x = asta_ng, y = redness_index, shape = factor(year)), color = "#d94801", size = 1.5, data = sample_to_show)+
  geom_text_repel(aes(x = asta_ng, y = redness_index, label = image_letter), color = "#d94801", data = sample_to_show, size = 4, max.overlaps = 15)+
  labs(x= "Total astaxanthin content (TAC) (ng) (log axis)", y ="Redness indices (log axis)")+
  theme_bw(base_size=12) + theme(legend.position = "none") +
  scale_y_continuous(trans = "log")+ 
  scale_x_continuous(trans = "log")

b <- ggplot(red_index_by_image)+
  # all images
  geom_point(aes(y = asta_ng,x = redness_index, shape = factor(year)), color = "grey60", alpha = 0.7, size = 1)+
  # averages and smooth
  geom_point(aes(y = asta_ng, x = sample_redness, shape = factor(year)), color = "#01665e", size = 2)+
  #geom_smooth(aes(x = asta_ng, y = sample_redness_log), color = "grey50", size = 0.5, se= F)+
  geom_smooth(aes(y = asta_ng, x = sample_redness), method = "lm", color = "#01665e", formula = 'y ~ x',linewidth = 0.5, se= F)+
  # print selected image
  geom_point(aes(y = asta_ng, x = redness_index, shape = factor(year)), color = "#d94801", size = 1.5, data = sample_to_show)+
  geom_text_repel(aes(y = asta_ng, x = redness_index, label = image_letter), color = "#d94801", data = sample_to_show, size =4, max.overlaps = 15)+
  labs(y= "Total astaxanthin content (TAC) (ng) (log axis)", x ="Redness indices (log axis)", shape = "Year")+
  theme_bw(base_size=12) + 
  scale_y_continuous(trans = "log")+ 
  scale_x_continuous(trans = "log")

a + b

ggplot(red_index_by_image)+
  # all images
  geom_point(aes(y = asta_ng,x = redness_index,shape = factor(year)), color = "grey60", alpha = 0.7, size = 1)+
  # averages and smooth
  geom_point(aes(y = asta_ng, x = sample_redness,shape = factor(year)), color = "#01665e", size = 2)+
  #geom_smooth(aes(x = asta_ng, y = sample_redness_log), color = "grey50", size = 0.5, se= F)+
  geom_smooth(aes(y = asta_ng, x = sample_redness), method = "lm", formula = 'y ~ x',color = "#01665e", linewidth = 0.5, se= F)+
  # print selected image
  geom_point(aes(y = asta_ng, x = redness_index,shape = factor(year)), color = "#d94801", size = 1.5, data = sample_to_show)+
  geom_text_repel(aes(y = asta_ng, x = redness_index, label = image_letter), color = "#d94801", data = sample_to_show, size =6, max.overlaps = 15)+
  labs(y= "TAC (ng) (log-transformed)", x ="Redness indices (mm2) (log-transformed)", shape ="Year")+
  theme_bw(base_size=14) + 
  scale_y_continuous(trans = "log", labels = scales::number_format(accuracy = 0.01))+ 
  scale_x_continuous(trans = "log", labels = scales::number_format(accuracy = 0.01))

ggsave("figures/HPLC_redness_log.pdf", width=9, height =5)

We can see that points are often, but not systemically, grouped by year.

Density distribution of orginal data (log transformed) by YEAR

# Image redness indices
a <- ggplot()+
  geom_density(aes(x = log(redness_index)), color = "black", data = red_index_by_image, size = 1.5)+
  geom_density(aes(x = log(redness_index)), data = filter(red_index_by_image, year == 2018), color = "violetred")+
  geom_density(aes(x = log(redness_index)), data = filter(red_index_by_image, year == 2019), color = "yellowgreen")+
  labs(title="Image redness indices")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
# Sample redness indices
b <- ggplot(red_index_by_image)+
  geom_density(aes(x = log(sample_redness)),color = "black", data = red_index_by_image, size = 1.5)+
  geom_density(aes(x = log(sample_redness)), data = filter(red_index_by_image, year == 2018), color = "violetred")+
  geom_density(aes(x = log(sample_redness)), data = filter(red_index_by_image, year == 2019), color = "yellowgreen")+
  labs(title="Sample redness indices")

# HPLC astaxanthin
c <- ggplot(red_index_by_image)+
  geom_density(aes(x = log(asta_ng)), color = "black", data = red_index_by_image, size = 1.5)+
  geom_density(aes(x = log(asta_ng)), data = filter(red_index_by_image, year == 2018), color = "violetred")+
  geom_density(aes(x = log(asta_ng)), data = filter(red_index_by_image, year == 2019), color = "yellowgreen")+
  labs(title="HPLC (ng)")

a + b + c

Black = both years, Green = 2019, Purple = 2018

Obvious bi-modal distribution influenced by the year (copepods less red in 2018), coherent between HPLC and imaging methods. We can also see a small bi-modal distribution within year 2018. Is it a problem ? I think not if residuals are normal, which is the case (see next paragraph).



Density distribution of residuals with the two best models

a <- ggplot()+
  geom_density(aes(x = lm_RI_TAC$residuals), color = "darkblue")
shapiro.test(lm_RI_TAC$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  lm_RI_TAC$residuals
## W = 0.97208, p-value = 0.1195
b <- ggplot()+
  geom_density(aes(x = lm_TAC_RI$residuals), color = "darkblue")
shapiro.test(lm_TAC_RI$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  lm_TAC_RI$residuals
## W = 0.98049, p-value = 0.3454
a+b

Residuals are normally distributed in both senses of the relationship.



5. Do we have to remove outliers ?


I gathered two types of outliers in the models :
- maximum leverage = max Cook distance : green
- maximum residuals (based on diagnostic graphs) : blue

leverage <- data.frame(lev_TAC_RI = hatvalues(lm_TAC_RI), lev_RI_TAC = hatvalues(lm_RI_TAC))
red_index_by_sample <- bind_cols(red_index_by_sample, leverage)



For ln(TAC) = f(ln(RI))

outliers_lev_TAC_RI <- red_index_by_sample %>% filter(lev_TAC_RI > 0.04)
outliers_res_TAC_RI <- red_index_by_sample %>% filter(rownames(red_index_by_sample) %in% c("16", "20", "41"))

ggplot(red_index_by_image)+
  # all images
  geom_point(aes(y = asta_ng,x = redness_index, shape = factor(year)), color = "grey60", alpha = 0.7, size = 1)+
  # averages and smooth
  geom_point(aes(y = asta_ng, x = sample_redness, shape = factor(year)), color = "#01665e", size = 2)+
  #geom_smooth(aes(x = asta_ng, y = sample_redness_log), color = "grey50", size = 0.5, se= F)+
  geom_smooth(aes(y = asta_ng, x = sample_redness), method = "lm", color = "#01665e", formula = 'y ~ x',size = 0.5, se= F)+
  # print selected image
  geom_point(aes(y = asta_ng, x = redness_index, shape = factor(year)), color = "#d94801", size = 1.5, data = sample_to_show)+
  geom_text_repel(aes(y = asta_ng, x = redness_index, label = image_letter), color = "#d94801", data = sample_to_show, size =6, max.overlaps = 15)+
  # outliers 
  geom_point(aes(y = asta_ng, x = sample_redness), shape = "x", color = "green", size = 4, data = outliers_lev_TAC_RI)+
  geom_point(aes(y = asta_ng, x = sample_redness), shape = "x", color = "blue", size = 4, data = outliers_res_TAC_RI)+
  #labs etc
  labs(y= "Total astaxanthin content (TAC) (ng) (log axis)", x ="Redness indices (log axis)")+
  theme_bw(base_size=10) + 
  scale_y_continuous(trans = "log")+ 
  scale_x_continuous(trans = "log")

For ln(RI) = f(ln(TAC))

outliers_lev_RI_TAC <- red_index_by_sample %>% filter(lev_RI_TAC > 0.045)
outliers_res_RI_TAC <- red_index_by_sample %>% filter(rownames(red_index_by_sample) %in% c("16", "20", "21"))

ggplot(red_index_by_image)+
  # all images
  geom_point(aes(x = asta_ng, y = redness_index, shape = factor(year)), color = "grey60", alpha = 0.7, size = 1)+
  # averages and smooth
  geom_point(aes(x = asta_ng, y = sample_redness, shape = factor(year)), color = "#01665e", size = 2)+
  #geom_smooth(aes(x = asta_ng, y = sample_redness_log), color = "grey50", size = 0.5, se= F)+
  geom_smooth(aes(x = asta_ng, y = sample_redness), method = "lm", color = "#01665e", formula = 'y ~ x',size = 0.5, se= F)+
  # print selected image
  geom_point(aes(x = asta_ng, y = redness_index, shape = factor(year)), color = "#d94801", size = 1.5, data = sample_to_show)+
  geom_text_repel(aes(x = asta_ng, y = redness_index, label = image_letter), color = "#d94801", data = sample_to_show, size =6, max.overlaps = 15)+
  # outliers 
  geom_point(aes(x = asta_ng, y = sample_redness), shape = "x", color = "green", size = 4, data = outliers_lev_RI_TAC)+
  geom_point(aes(x = asta_ng, y = sample_redness), shape = "x", color = "blue", size = 4, data = outliers_res_RI_TAC)+
  #labs etc
  labs(x= "Total astaxanthin content (TAC) (ng) (log axis)", y ="Redness indices (log axis)")+
  theme_bw(base_size=10) + 
  scale_y_continuous(trans = "log")+ 
  scale_x_continuous(trans = "log")




  • Points with a high leverage belongs to the few samples with really low redness/astaxanthin content. However, we don’t have so many reasons to remove them as they provide an important information, and do not overpass the Cook’s distance.
  • Points with large residuals belongs to the “weird” samples that I identified (grey copepods with broken lipid sacs). Indeed, they are quite extreme, but residuals are still normally distributed. We could remove them to only if we have an ecological/methodological reasons of doing so, which has to be discussed with Emilia. Afterwards, we show that remving them allow to go from 60% to 74% in R-squared. But again, it has to be done only with a strong reason.


red_index_by_image2 <- red_index_by_image %>% filter(!(sample_name %in% c("AT_p116", "AT_p117", "AT_p118", "AT_p119", "AT_p120", "AT_p105")))
red_index_by_sample2 <- red_index_by_sample %>% filter(!(sample_name %in% c("AT_p116", "AT_p117", "AT_p118", "AT_p119", "AT_p120", "AT_p105")))
a <- ggplot(red_index_by_image2)+
  # all images
  geom_point(aes(y = asta_ng,x = redness_index, shape = factor(year)), color = "grey60", alpha = 0.7, size = 1)+
  # averages and smooth
  geom_point(aes(y = asta_ng, x = sample_redness, shape = factor(year)), color = "#01665e", size = 2)+
  #geom_smooth(aes(x = asta_ng, y = sample_redness_log), color = "grey50", size = 0.5, se= F)+
  geom_smooth(aes(y = asta_ng, x = sample_redness), method = "lm", color = "#01665e", formula = 'y ~ x',size = 0.5, se= F)+
  # print selected image
  geom_point(aes(y = asta_ng, x = redness_index, shape = factor(year)), color = "#d94801", size = 1.5, data = sample_to_show)+
  geom_text_repel(aes(y = asta_ng, x = redness_index, label = image_letter), color = "#d94801", data = sample_to_show, size =4, max.overlaps = 15)+
  #labs etc
  labs(y= "Total astaxanthin content (TAC) (ng) (log axis)", x ="Redness indices (log axis)", shape = "Year")+
  theme_bw(base_size=12) + 
  scale_y_continuous(trans = "log")+
  scale_x_continuous(trans = "log")

b<- ggplot(red_index_by_image2)+
  # all images
  geom_point(aes(x = asta_ng, y = redness_index, shape = factor(year)), color = "grey60", alpha = 0.7, size = 1)+
  # averages and smooth
  geom_point(aes(x = asta_ng, y = sample_redness, shape = factor(year)), color = "#01665e", size = 2)+
  #geom_smooth(aes(x = asta_ng, y = sample_redness_log), color = "grey50", size = 0.5, se= F)+
  geom_smooth(aes(x = asta_ng, y = sample_redness), method = "lm", color = "#01665e", formula = 'y ~ x', size = 0.5, se= F)+
  # print selected image
  geom_point(aes(x = asta_ng, y = redness_index, shape = factor(year)), color = "#d94801", size = 1.5, data = sample_to_show)+
  geom_text_repel(aes(x = asta_ng, y = redness_index, label = image_letter), color = "#d94801", data = sample_to_show, size =4, max.overlaps = 15)+
  labs(x= "Total astaxanthin content (TAC) (ng) (log axis)", y ="Redness indices (log axis)", shape = "Year")+
  theme_bw(base_size=12) + 
  scale_y_continuous(trans = "log")+ 
  scale_x_continuous(trans = "log")

a+b

ln(TAC) = f(ln(RI))
-> p-value < 0.001, Adjusted R-squared = 0.74, slope (b) = 0.91, intercept (ln(a)) = -2.64 so a = 0.096
ln(RI) = f(ln(TAC))
-> p-value < 0.001, Adjusted R-squared = 0.7405, slope (b) = 0.81, intercept (ln(a)) = 3.13, so a = 22.8

summary(lm(log(asta_ng) ~ log(sample_redness), data = red_index_by_sample2))
## 
## Call:
## lm(formula = log(asta_ng) ~ log(sample_redness), data = red_index_by_sample2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.50193 -0.40344  0.02598  0.36584  1.43693 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -2.34663    0.33426   -7.02 1.98e-09 ***
## log(sample_redness)  0.91361    0.06795   13.45  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5787 on 62 degrees of freedom
## Multiple R-squared:  0.7446, Adjusted R-squared:  0.7405 
## F-statistic: 180.8 on 1 and 62 DF,  p-value: < 2.2e-16
summary(lm(log(sample_redness) ~ log(asta_ng), data = red_index_by_sample2))
## 
## Call:
## lm(formula = log(sample_redness) ~ log(asta_ng), data = red_index_by_sample2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.62971 -0.38337 -0.00697  0.34627  1.13108 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3.13910    0.14135   22.21   <2e-16 ***
## log(asta_ng)  0.81506    0.06062   13.45   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5466 on 62 degrees of freedom
## Multiple R-squared:  0.7446, Adjusted R-squared:  0.7405 
## F-statistic: 180.8 on 1 and 62 DF,  p-value: < 2.2e-16